In [1]:
from xml.etree import ElementTree as ET
In [2]:
# Parse the reduced sample file used by the two example cells that follow.
document_tree = ET.parse(source='./data/mondial_database_less.xml')
In [3]:
# print names of all countries
# Walks every direct child of the document root and prints the text of its
# <name> child (assumes each root child of the sample file is a country --
# TODO confirm).
# print() with a single argument behaves identically on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
for child in document_tree.getroot():
    print(child.find('name').text)
In [4]:
# print names of all countries and their cities
# Emits one line per country in the form "* <country>: <city>, <city>, ...".
for country_el in document_tree.iterfind('country'):
    # Element.iter() visits every descendant <city>, however deeply nested.
    # It replaces getiterator(), which is deprecated and no longer exists in
    # recent Python 3 releases.
    city_names = [city_el.find('name').text
                  for city_el in country_el.iter('city')]
    # A single print() call keeps the output identical on Python 2 and 3;
    # join() replaces the manual "append ', ' then strip the last two
    # characters" idiom.
    print('* ' + country_el.find('name').text + ': ' + ', '.join(city_names))
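Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find:

1. the 10 countries with the lowest infant mortality rates
2. the 10 cities with the largest population
3. the 10 ethnic groups with the largest overall populations (summed over all countries, using each country's latest population figure)
4. the name and country of a) the longest river, b) the largest lake, and c) the airport at the highest elevation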
In [5]:
import pandas as pd
import numpy as np
# Load the full data set; `root` is the element every later cell queries.
document = ET.parse(source='./data/mondial_database.xml')
root = document.getroot()
In [6]:
# Infant mortality per country, lowest ten shown.
# Countries without a usable value are kept with NaN; sort_values() places
# NaN last by default, so they never appear among the lowest rates.
country, mortality = [], []
for c in root.findall('country'):
    country.append(c.find('name').text)
    try:
        mortality.append(float(c.find('infant_mortality').text))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: no <infant_mortality> child (find() returned None)
        # TypeError:      element present but empty (text is None)
        # ValueError:     text is not a number
        # Anything else is unexpected and should surface, not be swallowed.
        mortality.append(np.nan)
mortalities = pd.DataFrame({'country': country, 'mortality': mortality},
                           columns=['country', 'mortality'])
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
mortalities.sort_values('mortality', ascending=True).head(10)
Out[6]:
In [7]:
# Population per city, ten largest shown.
def _population_year(pop_el):
    """Return the element's 'year' attribute as an int, or -1 if absent or unparseable."""
    try:
        return int(pop_el.get('year'))
    except (TypeError, ValueError):
        return -1


country, city, population = [], [], []
# './country//city' matches <city> elements at any depth below a <country>.
# './country/city' only matches direct children and skips cities nested
# deeper (NOTE(review): presumably inside <province> elements -- confirm
# against the data file).
for c in root.findall('./country//city'):
    country.append(c.attrib['country'])
    city.append(c.find('name').text)
    try:
        # Prefer the most recent figure. max() returns the first element on
        # ties, so when no entry carries a 'year' this still picks the first
        # <population> child, as find('population') would.
        # NOTE(review): assumes city <population> entries are tagged with a
        # 'year' attribute like the country-level ones -- confirm.
        latest = max(c.findall('population'), key=_population_year)
        population.append(int(latest.text))
    except (TypeError, ValueError):
        # ValueError: no <population> child at all (max() of an empty list)
        #             or non-integer text
        # TypeError:  element present but empty (text is None)
        population.append(np.nan)
cityPops = pd.DataFrame({'city': city, 'country': country,
                         'population': population},
                        columns=['city', 'country', 'population'])
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
cityPops.sort_values('population', ascending=False).head(10)
Out[7]:
In [8]:
# One row per (country, year) population entry.
population, country, year = [], [], []
# Restrict the walk to <country> elements: iterating over every child of the
# root would also visit non-country elements and fail on any that has no
# <name> child.
for c in root.findall('country'):
    current = c.find('name').text
    for pop in c.findall('population'):
        country.append(current)
        year.append(int(pop.attrib['year']))
        try:
            population.append(int(pop.text))
        except (TypeError, ValueError):
            # TypeError: empty element (text is None); ValueError: not an integer
            population.append(np.nan)
pops = pd.DataFrame({'country': country, 'population': population,
                     'year': year},
                    columns=['country', 'population', 'year'])
pops.head(10)
Out[8]:
In [9]:
#ref: http://stackoverflow.com/questions/27488080/
# Keep exactly one row per country: the entry with the most recent year.
# idxmax() yields a single index label per group, so a country that has two
# entries for its latest year contributes one row (the first one) instead of
# two. Filtering on `year == year.max()` would keep both, and the country
# would then be counted twice when joined to other per-country tables.
# groupby() sorts its keys, so the result is ordered by country name.
pops = pops.loc[pops.groupby('country')['year'].idxmax()]
pops = pops.reset_index(drop=True)
pops.head()
Out[9]:
In [10]:
# One row per (country, ethnic group) with the group's share in percent.
country, group, percentage = [], [], []
# Restrict the walk to <country> elements: iterating over every child of the
# root would also visit non-country elements and fail on any that has no
# <name> child.
for c in root.findall('country'):
    current = c.find('name').text
    for ethnic_el in c.findall('ethnicgroup'):
        country.append(current)
        group.append(ethnic_el.text)
        percentage.append(float(ethnic_el.attrib['percentage']))
groups = pd.DataFrame({'country': country, 'name': group,
                       'percentage': percentage},
                      columns=['country', 'name', 'percentage'])
groups.head()
Out[10]:
In [11]:
# Join every ethnic-group row to its country's population row, then derive
# the group's head count: percentage * population, floor-divided by 100.
combined = groups.merge(pops, on="country")
combined['subpop'] = combined['percentage'].mul(combined['population']).floordiv(100)
combined.head()
Out[11]:
In [12]:
# Total head count per ethnic group across all countries, ten largest first.
# 'subpop' is selected before summing so the other columns are not
# aggregated, and sort_values() replaces DataFrame.sort(), which was removed
# from pandas.
combined.groupby('name')['subpop'].sum().sort_values(ascending=False).head(10)
Out[12]:
In [13]:
# a)
# Longest river: one row per <river> element found anywhere in the document.
country, name, length = [], [], []
for river_el in root.findall('.//river'):
    country.append(river_el.attrib['country'])
    name.append(river_el.find('name').text)
    try:
        # float() rather than int(): int() raises ValueError on decimal text
        # such as "1364.5", which would silently turn a valid length into NaN.
        length.append(float(river_el.find('length').text))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: no <length> child; TypeError: empty element;
        # ValueError: text is not a number.
        length.append(np.nan)
rivers = pd.DataFrame({'country': country, 'length': length, 'name': name},
                      columns=['country', 'length', 'name'])
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
rivers.sort_values('length', ascending=False).head(1)
Out[13]:
In [14]:
# b)
# Largest lake: one row per <lake> element found anywhere in the document.
country, name, area = [], [], []
for lake_el in root.findall('.//lake'):
    country.append(lake_el.attrib['country'])
    name.append(lake_el.find('name').text)
    try:
        # float() rather than int(): int() raises ValueError on decimal text
        # such as "8.5", which would silently turn a valid area into NaN.
        area.append(float(lake_el.find('area').text))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: no <area> child; TypeError: empty element;
        # ValueError: text is not a number.
        area.append(np.nan)
lakes = pd.DataFrame({'area': area, 'country': country, 'name': name},
                     columns=['area', 'country', 'name'])
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
lakes.sort_values('area', ascending=False).head(1)
Out[14]:
In [15]:
# c) (Answer here appears to be a bit out of date)
# Highest airport: one row per <airport> element found anywhere in the document.
country, name, elevation = [], [], []
for airport_el in root.findall('.//airport'):
    country.append(airport_el.attrib['country'])
    name.append(airport_el.find('name').text)
    try:
        # float() rather than int(): int() raises ValueError on decimal text,
        # which would silently turn a valid elevation into NaN.
        elevation.append(float(airport_el.find('elevation').text))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: no <elevation> child; TypeError: empty element;
        # ValueError: text is not a number.
        elevation.append(np.nan)
airports = pd.DataFrame({'country': country, 'elevation': elevation,
                         'name': name},
                        columns=['country', 'elevation', 'name'])
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
airports.sort_values('elevation', ascending=False).head(1)
Out[15]: